// Notebook bootstrap: configure the kernel, register an extra Maven repository and pull in RDFRules and plotly.
// NOTE(review): `kernel.silent(true)` presumably suppresses the automatic echo of cell results — confirm against the kernel API.
kernel.silent(true)
import coursierapi.MavenRepository
// Add JitPack to the interpreter's repository list before the `$ivy` imports below are resolved.
interp.repositories() ++= Seq(MavenRepository.of("https://jitpack.io"))
import $ivy.`com.github.propi:rdfrules:1.5.0`
//import $ivy.`com.github.propi.rdfrules::core:1.0.0`
import collection._
import org.apache.jena.riot.Lang
import scala.util.control.Breaks._
import scala.collection.immutable.ListMap
import $ivy.`org.plotly-scala::plotly-almond:0.8.2`
import plotly._, plotly.element._, plotly.layout._, plotly.Almond._
// NOTE(review): `init` comes from one of the plotly imports above; `offline=true` presumably bundles plotly.js into the notebook — confirm.
init(offline=true)
// Replace the REPL pretty-printer with a copy whose `defaultHeight` is 3.
repl.pprinter() = repl.pprinter().copy(defaultHeight = 3)
import com.github.propi.rdfrules.data._
import com.github.propi.rdfrules.algorithm.amie._
import com.github.propi.rdfrules.algorithm.dbscan._
import com.github.propi.rdfrules.utils._
import com.github.propi.rdfrules.index._
import com.github.propi.rdfrules.rule._
import com.github.propi.rdfrules.ruleset._
// Predicate URIs (kept as plain strings) of the descriptive/typing properties:
// rdfs:label, rdfs:comment, schema:alternateName, schema:image and rdf:type.
val rdfsLabel = "http://www.w3.org/2000/01/rdf-schema#label"
val rdfsComment = "http://www.w3.org/2000/01/rdf-schema#comment"
val alternateName = "http://schema.org/alternateName"
val image = "http://schema.org/image"
val rdfType = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
// Rounds `value` to `scale` decimal places, resolving ties away from zero (HALF_UP).
val rounded = (value: Double, scale: Integer) => {
  val decimal = BigDecimal(value)
  val rescaled = decimal.setScale(scale, BigDecimal.RoundingMode.HALF_UP)
  rescaled.toDouble
}
// Do it the brute-force way: every cache file becomes a dataset, then an index ...
// The four CZSO cube slices (regions/districts, total/by-sex), each read from its own cache file.
val regionTotalSlice = Dataset.fromCache("../cache/jaur/jaur-regions-total.cache")
val regionBySexSlice = Dataset.fromCache("../cache/jaur/jaur-regions-bysex.cache")
val districtTotalSlice = Dataset.fromCache("../cache/jaur/jaur-districts-total.cache")
val districtBySexSlice = Dataset.fromCache("../cache/jaur/jaur-districts-bysex.cache")
// YAGO data split over three cache files (hop 0, 1 and 2). The bare `.size` expressions only
// evaluate the size of each part (presumably shown as notebook cell output).
val yagoHop0 = Dataset.fromCache("../cache/yago-hop0.cache")
yagoHop0.size
val yagoHop1 = Dataset.fromCache("../cache/yago-hop1.cache")
yagoHop1.size
val yagoHop2 = Dataset.fromCache("../cache/yago-hop2.cache")
yagoHop2.size
// All three hops combined into one dataset.
val yagoDataset = yagoHop0 + yagoHop1 + yagoHop2
yagoDataset.size
// YAGO quads without the descriptive/typing predicates (labels, comments, alternate names,
// rdf:type and images).
val yagoDatasetFiltered = {
  // Checked in this order; a quad is dropped as soon as its predicate matches one entry.
  val excludedPredicates = List(rdfsLabel, rdfsComment, alternateName, rdfType, image)
  yagoDataset.filter { q =>
    !excludedPredicates.exists(excluded => q.triple.predicate.hasSameUriAs(excluded))
  }
}
// Share of YAGO quads that survive the predicate filter (0.0 – 1.0).
val ratio: Double = yagoDatasetFiltered.size.toDouble / yagoDataset.size.toDouble
// Summary of the form "<kept> / <total> = <percent>%". The ratio is scaled to 0–100 *before*
// rounding: rounding first and multiplying afterwards re-introduces binary floating-point
// noise (e.g. 0.29 * 100 == 28.999999999999996).
s"${yagoDatasetFiltered.size} / ${yagoDataset.size} = ${rounded(ratio * 100, 0)}%"
// Linking data read directly from a Turtle file (judging by the file name, it links YAGO
// entities to CZSO reference areas).
val refAreaLinking = Dataset("../data/linking/yagoCZSOLinking.ttl")
// One merged dataset per cube: cube slice + filtered YAGO + linking data.
val regionTotalDataset = regionTotalSlice + yagoDatasetFiltered + refAreaLinking
val regionBySexDataset = regionBySexSlice + yagoDatasetFiltered + refAreaLinking
val districtTotalDataset = districtTotalSlice + yagoDatasetFiltered + refAreaLinking
val districtBySexDataset = districtBySexSlice + yagoDatasetFiltered + refAreaLinking
// Build an index over each merged dataset and cache it under ../cache/jaur-yago/.
val regionTotalIndex = regionTotalDataset.index().cache("../cache/jaur-yago/regionTotalIndex.cache")
val regionBySexIndex = regionBySexDataset.index().cache("../cache/jaur-yago/regionBySexIndex.cache")
val districtTotalIndex = districtTotalDataset.index().cache("../cache/jaur-yago/districtTotalIndex.cache")
val districtBySexIndex = districtBySexDataset.index().cache("../cache/jaur-yago/districtBySexIndex.cache")
// Re-binds the same four names, this time reading the cache files written just above.
// NOTE(review): presumably these are an alternative notebook cell to the four lines above (build
// once, reload afterwards); declared together in one compiled script the duplicate `val`s would
// clash — confirm. The meaning of the second argument (`false`) is not visible here.
val regionTotalIndex = Index.fromCache("../cache/jaur-yago/regionTotalIndex.cache",false)
val regionBySexIndex = Index.fromCache("../cache/jaur-yago/regionBySexIndex.cache",false)
val districtTotalIndex = Index.fromCache("../cache/jaur-yago/districtTotalIndex.cache",false)
val districtBySexIndex = Index.fromCache("../cache/jaur-yago/districtBySexIndex.cache",false)
// Shorthand for wrapping a string in a `TripleItem.Uri`.
val uri = (value: String) => TripleItem.Uri(value)
val qbDataSet = uri("http://purl.org/linked-data/cube#dataSet")
// Namespace prefix of the CZSO ontology; the measure URIs below are built from it.
val czsoUri = "http://data.czso.cz/ontology/"
val refArea = uri("http://data.czso.cz/ontology/refArea")
// Rule constraint built from the `Object` constants position.
val constantsAtObject = RuleConstraint.ConstantsAtPosition.ConstantsPosition.Object
val constantsOnlyAtObject = RuleConstraint.ConstantsAtPosition(constantsAtObject)
// in all cubes
val unemploymentRate = uri(czsoUri+"podilNezamestnanych")
val reachableApplicants = uri(czsoUri+"dosazitelniNeumisteniUchazeciOZamestnani")
// only in total cubes
val unplacedApplicants = uri(czsoUri+"neumisteniUchazeciOZamestnani")
val vacaniesCount = uri(czsoUri+"pocetVolnychMist")
// The four measures as an array, and as `OneOf` alternatives for use in rule patterns.
val measures = Array(unemploymentRate,reachableApplicants,unplacedApplicants,vacaniesCount)
val oneOfAllMeasures = OneOf(unemploymentRate,reachableApplicants,unplacedApplicants,vacaniesCount)
// Only the two measures marked "in all cubes" above.
val oneOfBySexMeasures = OneOf(unemploymentRate,reachableApplicants)
// Slice identifiers of the region cubes and a `OneOf` over both.
val regionTotalSliceUri = uri("jaur-regions-total")
val regionBySexSliceUri = uri("jaur-regions-bysex")
val oneOfRegionCubes = OneOf(regionTotalSliceUri,regionBySexSliceUri)
// Slice identifiers of the district cubes and a `OneOf` over both.
val districtTotalSliceUri = uri("jaur-districts-total")
val districtBySexSliceUri = uri("jaur-districts-bysex")
val oneOfDistrictCubes = OneOf(districtTotalSliceUri,districtBySexSliceUri)
// Rule pattern for the region-total cube, built from four atom patterns:
//   - any atom with subject 'b' in graph "yago",
//   - 'a' refArea 'b' in graph "czso",
//   - 'a' qb:dataSet <jaur-regions-total> in graph "czso",
//   - 'a' <one of the four measures> in graph "czso".
// NOTE(review): reads as body atoms joined by `&:` with the head atom after `=>:` — confirm
// against the RDFRules pattern DSL.
val regionTotalPattern = (
AtomPattern(subject = 'b', graph = uri("yago")) &:
AtomPattern(subject = 'a', predicate = refArea, `object` = 'b', graph = uri("czso")) &:
AtomPattern(subject = 'a', predicate = qbDataSet, `object` = regionTotalSliceUri, graph = uri("czso"))
=>:
AtomPattern(subject = 'a', predicate = oneOfAllMeasures, graph = uri("czso"))
)
// Support threshold derived from a cube slice: number of quads with the qb:dataSet predicate
// divided (integer division) by the number of distinct objects of refArea triples.
val minSupport = (d: Dataset) => {
  val dataSetQuadCount = d.filter(_.triple.predicate == qbDataSet).size
  val distinctAreas = d.filter(_.triple.predicate == refArea).triples.map(_.`object`).toSet
  dataSetQuadCount / distinctAreas.size
}
// AMIE mining task for the region-total cube: support threshold derived from the slice, rules of
// length at most 6, head size threshold 0, the object-position constants constraint and the
// region-total rule pattern.
val regionTotalTask = Amie()
.addThreshold(Threshold.MinSupport(minSupport(regionTotalSlice)))
.addThreshold(Threshold.MaxRuleLength(6))
.addThreshold(Threshold.MinHeadSize(0))
.addConstraint(constantsOnlyAtObject)
.addPattern(regionTotalPattern)
// Mine on the region-total index, then print the rule count and the elapsed time in whole
// seconds (integer division by 1000).
val startTimeMillis = System.currentTimeMillis()
val regionTotalTaskRuleset = regionTotalIndex.mine(regionTotalTask)
println("rules: "+regionTotalTaskRuleset.size)
println("duration: " + (System.currentTimeMillis() - startTimeMillis) / 1000 + "s")
// Persist the raw mining result.
regionTotalTaskRuleset.export("../rulesets/jaur-yago/regionTotal.txt")
// Keeps only the rules whose body contains exactly one atom with the refArea predicate.
val filterRuleset = (r: Ruleset) => r.filterResolved { rule =>
  val refAreaAtoms = rule.body.count(_.predicate == refArea)
  refAreaAtoms == 1
}
// Apply the refArea filter to the mined region-total rules, cache the result, export it and
// print how many rules remain.
val regionTotalTaskRulesetFiltered = filterRuleset(regionTotalTaskRuleset)
.cache
regionTotalTaskRulesetFiltered.export("../rulesets/jaur-yago/regionTotalFiltered.txt")
println("rules: " + regionTotalTaskRulesetFiltered.size)
// Plots a single histogram of `seq`, drawn in the given colour at 60 % opacity.
val plotHistogram = (seq: Seq[Double], color: String) => {
  val barStyle = Marker(color = Color.StringColor(color), opacity = 0.6)
  val histogram = plotly.Histogram(seq, marker = barStyle)
  plot(Seq(histogram))
}
// Collects the measure stored under key `m` from every resolved rule of `r`.
// Uses `Option.get`, so a rule that lacks the measure raises NoSuchElementException.
val measureSequence = (r: Ruleset, m: TypedKeyMap.Key[Measure]) => {
  val perRule = r.resolvedRules.map(rule => rule.measures.get(m).get)
  perRule.toSeq
}
// Support values of the filtered region-total rules, plotted as a histogram.
// `measureSequence` yields plain `Measure`s, hence the downcast to `Measure.Support`.
val supportSeq = measureSequence(regionTotalTaskRulesetFiltered, Measure.Support)
.map(m => m.asInstanceOf[Measure.Support].value.toDouble)
plotHistogram(supportSeq, "green")
// Computes PCA confidence with the given minimum, then sorts the ruleset by PCA confidence
// and support.
val computeConfidence = (r: Ruleset, minConf: Double) => {
  val withConfidence = r.computePcaConfidence(minConf)
  withConfidence.sortBy(Measure.PcaConfidence, Measure.Support)
}
// PCA confidence for the filtered region-total rules (minimum 0.0), cached and exported.
val regionTotalTaskRulesetConfComputed = computeConfidence(regionTotalTaskRulesetFiltered,0.0).cache
regionTotalTaskRulesetConfComputed.export("../rulesets/jaur-yago/regionTotalConfComputed.txt")
// Histogram of the PCA confidence values.
val confSeq = measureSequence(regionTotalTaskRulesetConfComputed, Measure.PcaConfidence)
.map(m => m.asInstanceOf[Measure.PcaConfidence].value)
plotHistogram(confSeq, "green")
// Computes lift with the given minimum, then sorts the ruleset by lift and support.
val computeLift = (r: Ruleset, minLift: Double) => {
  val withLift = r.computeLift(minLift)
  withLift.sortBy(Measure.Lift, Measure.Support)
}
// Lift for the filtered region-total rules (minimum 0.0), cached and exported.
val regionTotalTaskRulesetLiftComputed = computeLift(regionTotalTaskRulesetFiltered,0.0).cache
regionTotalTaskRulesetLiftComputed.export("../rulesets/jaur-yago/regionTotalLiftComputed.txt")
// Histogram of all lift values.
val liftSeq = measureSequence(regionTotalTaskRulesetLiftComputed, Measure.Lift)
.map(m => m.asInstanceOf[Measure.Lift].value)
plotHistogram(liftSeq, "green")
// Keep rules with lift >= 1.0, prune the result and cache it.
// NOTE(review): the meaning of the two `false` arguments to `pruned` is not visible here.
val regionTotalTaskRulesetMinLift = regionTotalTaskRulesetLiftComputed
.filter(r => r.measures.get(Measure.Lift).get.value >= 1.0)
.pruned(false,false)
.cache
println("rules: " + regionTotalTaskRulesetMinLift.size)
regionTotalTaskRulesetMinLift.export("../rulesets/jaur-yago/regionTotalMinLift.txt")
// Re-binds `liftSeq` to the lift values of the remaining rules and plots them again.
val liftSeq = measureSequence(regionTotalTaskRulesetMinLift, Measure.Lift)
.map(m => m.asInstanceOf[Measure.Lift].value)
plotHistogram(liftSeq, "green")
// True when the rule has the requested length, where length = body size + 1
// (the extra element is presumably the head atom).
val filterByLength = (r: ResolvedRule, length: Int) => {
  val ruleLength = r.body.size + 1
  ruleLength == length
}
// Counts the rules of `r` for every rule length in [minLength, maxLength] and returns the
// (length -> count) pairs ordered by count, largest first. Lengths with equal counts keep
// their ascending order because the sort is stable. An empty range yields an empty map.
val groupRulesByLength = (r: Ruleset, minLength: Int, maxLength: Int) => {
  // One filtering pass over the ruleset per length; no mutable accumulator or `breakable` needed.
  val countsByLength = (minLength to maxLength).map { length =>
    length -> r.filterResolved(rule => filterByLength(rule, length)).size
  }
  ListMap(countsByLength.sortWith(_._2 > _._2): _*)
}
// Draws a horizontal bar chart from `map`: the map values are the bar lengths (x axis), the
// map keys are the bar positions (y axis). Every bar is annotated with its value.
val plotHorizontalBar = (map: ListMap[Int, Int], color: String) => {
  val counts = map.values.toSeq
  val categories = map.keys.toSeq
  val barStyle = Marker(color = Color.StringColor(color), opacity = 0.6)
  val bars = Seq(Bar(counts, categories, orientation = Orientation.Horizontal, marker = barStyle))
  // One text label per bar, showing the bar's value.
  val valueLabels = counts.zip(categories).map { case (count, category) =>
    Annotation(
      x = count, y = category, text = count.toString,
      xanchor = Anchor.Center, yanchor = Anchor.Bottom, showarrow = false
    )
  }
  plot(bars, Layout(annotations = valueLabels))
}
// Bar chart of how many lift-filtered region-total rules have length 4, 5 and 6.
plotHorizontalBar(groupRulesByLength(regionTotalTaskRulesetMinLift, 4, 6), "green")
// Clusters the rules of `r` with DBSCAN, using atom-based rule similarity, and caches the
// clustered ruleset.
val makeClusters = (r: Ruleset, minNeighbours: Int, minSimilarity: Double) => {
  // Similarity measure picked up implicitly by the clustering below.
  implicit val atomSimilarity: SimilarityCounting[Rule.Simple] = SimilarityCounting.AtomsSimilarityCounting
  val clustered = r.makeClusters(DbScan(minNeighbours = minNeighbours, minSimilarity = minSimilarity))
  clustered.cache
}
// Cluster the lift-filtered region-total rules (minNeighbours = 3, minSimilarity = 0.85) and export them.
val regionTotalTaskRulesetClustered = makeClusters(regionTotalTaskRulesetMinLift, 3, 0.85)
regionTotalTaskRulesetClustered.export("../rulesets/jaur-yago/regionTotalClustered.txt")
// True when the rule carries a cluster measure equal to `Measure.Cluster(cluster)`.
// `Option.contains` makes a rule without any cluster measure simply not match, instead of
// throwing NoSuchElementException from `.get`.
val filterByCluster = (r: ResolvedRule, cluster: Int) =>
  r.measures.get(Measure.Cluster).contains(Measure.Cluster(cluster))
// Counts the rules of `r` per cluster number, starting at cluster 0 and stopping at the first
// cluster number with no rules. Returns (cluster -> count) ordered by count, largest first;
// clusters with equal counts keep their ascending order because the sort is stable.
// NOTE(review): this relies on cluster numbers being contiguous from 0 — any cluster after a
// gap is not counted; confirm that matches the clustering output.
val groupRulesByCluster = (r: Ruleset) => {
  // Lazy iteration: each cluster is counted only when reached, and `takeWhile` ends the scan
  // at the first empty cluster without exception-based `break`.
  val clusterSizes = Iterator
    .from(0)
    .map(cluster => cluster -> r.filterResolved(rule => filterByCluster(rule, cluster)).size)
    .takeWhile { case (_, size) => size > 0 }
    .toList
  ListMap(clusterSizes.sortWith(_._2 > _._2): _*)
}
// Bar chart of cluster sizes for the clustered region-total rules.
plotHorizontalBar(groupRulesByCluster(regionTotalTaskRulesetClustered), "green")
// ---- Region by-sex cube ----
// Same pattern shape as for the region-total cube, but bound to the by-sex slice and limited to
// the two measures available in the by-sex cubes.
val regionBySexPattern = (
AtomPattern(subject = 'b', graph = uri("yago")) &:
AtomPattern(subject = 'a', predicate = refArea, `object` = 'b', graph = uri("czso")) &:
AtomPattern(subject = 'a', predicate = qbDataSet, `object` = regionBySexSliceUri, graph = uri("czso"))
=>:
AtomPattern(subject = 'a',predicate = oneOfBySexMeasures, graph = uri("czso"))
)
// Mining task: support threshold derived from the slice, rule length at most 6, head size threshold 1.
val regionBySexTask = Amie()
.addThreshold(Threshold.MinSupport(minSupport(regionBySexSlice)))
.addThreshold(Threshold.MaxRuleLength(6))
.addThreshold(Threshold.MinHeadSize(1))
.addConstraint(constantsOnlyAtObject)
.addPattern(regionBySexPattern)
// Mine, then print the rule count and the elapsed time in whole seconds.
val startTimeMillis = System.currentTimeMillis()
val regionBySexTaskRuleset = regionBySexIndex.mine(regionBySexTask)
println("rules: "+regionBySexTaskRuleset.size)
println("duration: " + (System.currentTimeMillis() - startTimeMillis) / 1000 + "s")
// NOTE(review): the next four export files start with a capital "R" while the later ones use
// "regionBySex..." — confirm the naming is intended.
regionBySexTaskRuleset.export("../rulesets/jaur-yago/RegionBySex.txt")
// Keep only rules with exactly one refArea atom in the body.
val regionBySexTaskRulesetFiltered = filterRuleset(regionBySexTaskRuleset).cache
regionBySexTaskRulesetFiltered.export("../rulesets/jaur-yago/RegionBySexFiltered.txt")
println("rules: " + regionBySexTaskRulesetFiltered.size)
// Histogram of support values.
val supportSeq = measureSequence(regionBySexTaskRulesetFiltered, Measure.Support)
.map(m => m.asInstanceOf[Measure.Support].value.toDouble)
plotHistogram(supportSeq, "grey")
// PCA confidence (minimum 0.0) and its histogram.
val regionBySexTaskRulesetConfComputed = computeConfidence(regionBySexTaskRulesetFiltered,0.0).cache
regionBySexTaskRulesetConfComputed.export("../rulesets/jaur-yago/RegionBySexConfComputed.txt")
val confSeq = measureSequence(regionBySexTaskRulesetConfComputed, Measure.PcaConfidence)
.map(m => m.asInstanceOf[Measure.PcaConfidence].value)
plotHistogram(confSeq, "grey")
// Lift (minimum 0.0) and its histogram.
val regionBySexTaskRulesetLiftComputed = computeLift(regionBySexTaskRulesetFiltered,0.0).cache
regionBySexTaskRulesetLiftComputed.export("../rulesets/jaur-yago/RegionBySexLiftComputed.txt")
val liftSeq = measureSequence(regionBySexTaskRulesetLiftComputed, Measure.Lift)
.map(m => m.asInstanceOf[Measure.Lift].value)
plotHistogram(liftSeq, "grey")
// Keep rules with lift >= 1.0, prune the result and cache it.
val regionBySexTaskRulesetMinLift = regionBySexTaskRulesetLiftComputed
.filter(r => r.measures.get(Measure.Lift).get.value >= 1.0)
.pruned(false, false)
.cache
regionBySexTaskRulesetMinLift.export("../rulesets/jaur-yago/regionBySexMinLift.txt")
println("rules: " + regionBySexTaskRulesetMinLift.size)
// Lift histogram of the remaining rules (re-binds `liftSeq`).
val liftSeq = measureSequence(regionBySexTaskRulesetMinLift, Measure.Lift)
.map(m => m.asInstanceOf[Measure.Lift].value)
plotHistogram(liftSeq, "grey")
// Rule counts for lengths 4 to 6.
plotHorizontalBar(groupRulesByLength(regionBySexTaskRulesetMinLift, 4, 6), "grey")
// Cluster the remaining rules, export them and chart the cluster sizes.
val regionBySexTaskRulesetClustered = makeClusters(regionBySexTaskRulesetMinLift, 3, 0.85)
regionBySexTaskRulesetClustered.export("../rulesets/jaur-yago/regionBySexClustered.txt")
plotHorizontalBar(groupRulesByCluster(regionBySexTaskRulesetClustered), "grey")
// ---- District total cube ----
// Same pattern shape as for the region-total cube, bound to the district-total slice, with all
// four measures allowed in the measure atom.
val districtTotalPattern = (
AtomPattern(subject = 'b', graph = uri("yago")) &:
AtomPattern(subject = 'a', predicate = refArea, `object` = 'b', graph = uri("czso")) &:
AtomPattern(subject = 'a', predicate = qbDataSet, `object` = districtTotalSliceUri, graph = uri("czso"))
=>:
AtomPattern(subject = 'a', predicate = oneOfAllMeasures, graph = uri("czso"))
)
// Mining task: three times the slice-derived support threshold, rule length at most 6.
// Unlike the region tasks, no MinHeadSize threshold is added here.
val districtTotalTask = Amie()
.addThreshold(Threshold.MinSupport(minSupport(districtTotalSlice)*3))
.addThreshold(Threshold.MaxRuleLength(6))
.addConstraint(constantsOnlyAtObject)
.addPattern(districtTotalPattern)
// Mine, then print the rule count and the elapsed time in whole seconds.
val startTimeMillis = System.currentTimeMillis()
val districtTotalTaskRuleset = districtTotalIndex.mine(districtTotalTask)
println("rules: "+districtTotalTaskRuleset.size)
println("duration: " + (System.currentTimeMillis() - startTimeMillis) / 1000 + "s")
districtTotalTaskRuleset.export("../rulesets/jaur-yago/districtTotal.txt")
// Keep only rules with exactly one refArea atom in the body.
val districtTotalTaskRulesetFiltered = filterRuleset(districtTotalTaskRuleset).cache
districtTotalTaskRulesetFiltered.export("../rulesets/jaur-yago/districtTotalFiltered.txt")
println("rules: " + districtTotalTaskRulesetFiltered.size)
// Histogram of support values.
val supportSeq = measureSequence(districtTotalTaskRulesetFiltered, Measure.Support)
.map(m => m.asInstanceOf[Measure.Support].value.toDouble)
plotHistogram(supportSeq, "red")
// PCA confidence (minimum 0.0); values above 1 are dropped before plotting.
val districtTotalTaskRulesetConfComputed = computeConfidence(districtTotalTaskRulesetFiltered,0.0).cache
districtTotalTaskRulesetConfComputed.export("../rulesets/jaur-yago/districtTotalConfComputed.txt")
val confSeq = measureSequence(districtTotalTaskRulesetConfComputed, Measure.PcaConfidence)
.map(m => m.asInstanceOf[Measure.PcaConfidence].value)
.filter(c => c <= 1)
plotHistogram(confSeq, "red")
// Lift (minimum 0.0) and its histogram.
val districtTotalTaskRulesetLiftComputed = computeLift(districtTotalTaskRulesetFiltered,0.0).cache
districtTotalTaskRulesetLiftComputed.export("../rulesets/jaur-yago/districtTotalLiftComputed.txt")
val liftSeq = measureSequence(districtTotalTaskRulesetLiftComputed, Measure.Lift)
.map(m => m.asInstanceOf[Measure.Lift].value)
plotHistogram(liftSeq, "red")
// Keep rules whose lift is at least 1.0, then prune and cache the result. The bound is
// inclusive (`>=`) so this cube uses the same cut-off as the region-total, region-by-sex and
// district-by-sex pipelines.
val districtTotalTaskRulesetMinLift = districtTotalTaskRulesetLiftComputed
  .filter(r => r.measures.get(Measure.Lift).get.value >= 1.0)
  .pruned(false, false)
  .cache
// Export the lift-filtered district-total rules and print how many remain.
districtTotalTaskRulesetMinLift.export("../rulesets/jaur-yago/districtTotalMinLift.txt")
println("rules: " + districtTotalTaskRulesetMinLift.size)
// Lift histogram of the remaining rules (re-binds `liftSeq`).
val liftSeq = measureSequence(districtTotalTaskRulesetMinLift, Measure.Lift)
.map(m => m.asInstanceOf[Measure.Lift].value)
plotHistogram(liftSeq, "red")
// Cluster the remaining rules, export them and chart the cluster sizes.
val districtTotalTaskRulesetClustered = makeClusters(districtTotalTaskRulesetMinLift, 3, 0.85)
districtTotalTaskRulesetClustered.export("../rulesets/jaur-yago/districtTotalClustered.txt")
plotHorizontalBar(groupRulesByCluster(districtTotalTaskRulesetClustered), "red")
// ---- District by-sex cube ----
// Same pattern shape as for the other cubes, bound to the district by-sex slice and limited to
// the two measures available in the by-sex cubes.
val districtBySexPattern = (
AtomPattern(subject = 'b', graph = uri("yago")) &:
AtomPattern(subject = 'a', predicate = refArea, `object` = 'b', graph = uri("czso")) &:
AtomPattern(subject = 'a', predicate = qbDataSet, `object` = districtBySexSliceUri, graph = uri("czso"))
=>:
AtomPattern(subject = 'a', predicate = oneOfBySexMeasures, graph = uri("czso"))
)
// Mining task: three times the slice-derived support threshold, rule length at most 6.
// No MinHeadSize threshold is added here.
val districtBySexTask = Amie()
.addThreshold(Threshold.MinSupport(minSupport(districtBySexSlice)*3))
.addThreshold(Threshold.MaxRuleLength(6))
.addConstraint(constantsOnlyAtObject)
.addPattern(districtBySexPattern)
// Mine, then print the rule count and the elapsed time in whole seconds.
val startTimeMillis = System.currentTimeMillis()
val districtBySexTaskRuleset = districtBySexIndex.mine(districtBySexTask)
println("rules: "+districtBySexTaskRuleset.size)
println("duration: " + (System.currentTimeMillis() - startTimeMillis) / 1000 + "s")
districtBySexTaskRuleset.export("../rulesets/jaur-yago/districtBySex.txt")
// Keep only rules with exactly one refArea atom in the body.
val districtBySexTaskRulesetFiltered = filterRuleset(districtBySexTaskRuleset).cache
districtBySexTaskRulesetFiltered.export("../rulesets/jaur-yago/districtBySexFiltered.txt")
println("rules: " + districtBySexTaskRulesetFiltered.size)
// Histogram of support values.
// NOTE(review): "cls" is not a CSS colour name, unlike the "green"/"grey"/"red" used for the
// other cubes — confirm it renders as intended.
val supportSeq = measureSequence(districtBySexTaskRulesetFiltered, Measure.Support)
.map(m => m.asInstanceOf[Measure.Support].value.toDouble)
plotHistogram(supportSeq, "cls")
// PCA confidence (minimum 0.0); values above 1 are dropped before plotting.
val districtBySexTaskRulesetConfComputed = computeConfidence(districtBySexTaskRulesetFiltered,0.0).cache
districtBySexTaskRulesetConfComputed.export("../rulesets/jaur-yago/districtBySexConfComputed.txt")
val confSeq = measureSequence(districtBySexTaskRulesetConfComputed, Measure.PcaConfidence)
.map(m => m.asInstanceOf[Measure.PcaConfidence].value)
.filter(c => c <= 1)
plotHistogram(confSeq, "cls")
// Lift (minimum 0.0) and its histogram.
val districtBySexTaskRulesetLiftComputed = computeLift(districtBySexTaskRulesetFiltered,0.0).cache
districtBySexTaskRulesetLiftComputed.export("../rulesets/jaur-yago/districtBySexLiftComputed.txt")
val liftSeq = measureSequence(districtBySexTaskRulesetLiftComputed, Measure.Lift)
.map(m => m.asInstanceOf[Measure.Lift].value)
plotHistogram(liftSeq, "cls")
// Keep rules with lift >= 1.0, prune the result and cache it.
val districtBySexTaskRulesetMinLift = districtBySexTaskRulesetLiftComputed
.filter(r => r.measures.get(Measure.Lift).get.value >= 1.0)
.pruned(false, false)
.cache
districtBySexTaskRulesetMinLift.export("../rulesets/jaur-yago/districtBySexMinLift.txt")
println("rules: " + districtBySexTaskRulesetMinLift.size)
// Lift histogram of the remaining rules (re-binds `liftSeq`); the upper-bound filter is disabled.
val liftSeq = measureSequence(districtBySexTaskRulesetMinLift, Measure.Lift)
.map(m => m.asInstanceOf[Measure.Lift].value)
//.filter(c => c <= 1)
plotHistogram(liftSeq, "cls")
// Cluster the lift-filtered district by-sex rules, export them and chart the cluster sizes.
// `makeClusters` already ends with `.cache`, so no second `.cache` is chained here; this matches
// the clustering calls of the other three cubes.
val districtBySexTaskRulesetClustered = makeClusters(districtBySexTaskRulesetMinLift, 3, 0.85)
districtBySexTaskRulesetClustered.export("../rulesets/jaur-yago/districtBySexClustered.txt")
// NOTE(review): "cls" is not a CSS colour name, unlike the "green"/"grey"/"red" used for the
// other cubes — confirm it renders as intended.
plotHorizontalBar(groupRulesByCluster(districtBySexTaskRulesetClustered), "cls")